#!/usr/bin/env python
# encoding: utf-8
"""
@author: youfeng
@email: youfeng243@163.com
@license: Apache Licence
@file: change_page_store_struct.py
@time: 2018/1/1 16:40
"""

import sys

sys.path.append('..')
# Web page store (MongoDB access) and logging helpers
from common.mongo import MongDb
from logger import Logger

# Connection settings for the MongoDB database this script reads and rewrites.
# NOTE(review): host and credentials are hard-coded in source; consider
# loading them from configuration or the environment instead.
MONGO_DB_TARGET = dict(
    host="172.16.215.2",
    port=40042,
    db="crawl_data_new",
    username="work",
    password="haizhi",
)

# Logging module
log = Logger('change_page_store_struct.log').get_logger()

# Shared database handle, created once at import time.
target_db = MongDb(
    MONGO_DB_TARGET['host'],
    MONGO_DB_TARGET['port'],
    MONGO_DB_TARGET['db'],
    MONGO_DB_TARGET['username'],
    MONGO_DB_TARGET['password'],
    log=log,
)


def main(table_name='new_gsxt_search_page', batch_size=500, progress_every=1000,
         db=None, logger=None):
    """Wrap every string-valued ``text`` field of a collection in a list.

    Walks all documents of ``table_name``. A document whose ``text`` value is
    a string gets it replaced by ``[text]`` and is handed back to the store in
    batches through ``insert_batch_data``. Documents whose ``text`` is missing
    or is not a string are counted but not written.

    NOTE(review): the rewritten documents are sent to the same collection that
    is being traversed; this presumably relies on ``insert_batch_data``
    replacing existing documents by ``_id`` -- confirm against
    ``common.mongo.MongDb``.

    Args:
        table_name: Collection to read from and write back to.
        batch_size: Number of rewritten documents buffered before each write.
        progress_every: Log a progress line every this many traversed
            documents; a falsy value disables progress logging.
        db: Store object providing ``traverse(table, query)`` and
            ``insert_batch_data(table, docs)``. Defaults to the module-level
            ``target_db``.
        logger: Object providing ``info(msg)``. Defaults to the module-level
            ``log``.

    Returns:
        None.
    """
    if db is None:
        db = target_db
    if logger is None:
        logger = log

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # type check does not raise NameError on Python 3.
    try:
        string_types = basestring  # noqa: F821 -- Python 2 (str and unicode)
    except NameError:
        string_types = str

    pending = []
    count = 0
    process_count = 0
    for item in db.traverse(table_name, {}):
        count += 1

        text = item.get('text')
        if isinstance(text, string_types):
            item['text'] = [text]
            pending.append(item)
            process_count += 1

            if len(pending) >= batch_size:
                db.insert_batch_data(table_name, pending)
                # Clear in place so the same buffer is reused for the next batch.
                del pending[:]

        # Checked for every traversed document (not only converted ones) so a
        # progress milestone is never skipped by a non-string ``text``.
        if progress_every and count % progress_every == 0:
            logger.info("当前进度: count = {} process = {}".format(count, process_count))

    # Flush whatever is left over from the final, partially filled batch.
    if pending:
        db.insert_batch_data(table_name, pending)

    logger.info("完成数据格式调整...")


# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()
